Load packages and paths
library(grid)
library(ggplot2)
library(plyr)
library(RColorBrewer)
CMI Recommendations
cmi_main_blue = "#0071b2"
cmi_grey = "#929d9e"
cmi_light_blue = "#00c4d9"
cmi_pea_green = "#b5bf00"
cmi_rich_green = "#73933d"
cmi_rich_purple = "#8e7fac"
cmi_rich_red = "#d75920"
cmi_rich_blue = "#4c87a1"
cmi_rich_aqua = "#66c7c3"
cmi_rich_orange = "#eebf42"
cmi_vibrant_yellow = "#ffd457"
cmi_vibrant_orange = "#f58025"
cmi_vibrant_green = "#78a22f"
cmi_vibrant_garnet = "#e6006f"
cmi_vibrant_purple = "#9A4d9e"
cmi_vibrant_blue = "#19398a"
cmi_site_colors = c(cmi_vibrant_blue, cmi_rich_blue, cmi_vibrant_purple, cmi_vibrant_garnet,
cmi_rich_red, cmi_vibrant_orange, cmi_vibrant_yellow, cmi_vibrant_green)
cmi_site_colors_ramp = colorRampPalette(cmi_site_colors)
Read in the data and then some
# setwd('~/zarrar') setwd('~/Dropbox/Research/cmi/')
# setwd('qc/corr.scripts')
# script.dir <- dirname(sys.frame(1)$ofile) setwd(script_dir)
df <- read.csv("../corr.qc/qc_filt_epi_derivatives_smoothed_cpac.csv")[, -1]
nsites <- length(unique(df$site))
In our plots, we want to have percentile lines for each QC measure to indicate the distribution of each site relative to the whole sample
qc.measures <- colnames(df)[!(colnames(df) %in% c("uniqueid", "subject", "subid",
"site", "site.name", "session", "scan", "global"))]
qvals <- c(0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99)
qcat <- c(1, 5, 25, 50, 25, 5, 1)
qline <- c(3, 2, 5, 1, 5, 2, 3)
qsize <- c(0.4, 0.25, 0.3, 0.25, 0.3, 0.25, 0.4)
qcols <- c("grey10", "grey10", "grey10", "grey50", "grey10", "grey10", "grey10")
# qcols <- brewer.pal(8, 'Dark2')[c(1,2,3,4,3,2,1)]
Now let's get the percentiles
percentiles <- apply(subset(df, select = qc.measures), 2, quantile, qvals, na.rm = TRUE)
percentiles <- as.data.frame(cbind(percentiles, qcat, qline, qsize))
percentiles$qline <- as.factor(qline)
percentiles$qcat <- as.factor(qcat)
print(percentiles)
## falff_50 falff_75 falff_90 falff_fwhm falff_mean reho_50 reho_75
## 1% 0.3838 0.4038 0.4280 6.459 0.3813 0.05749 0.07110
## 5% 0.4945 0.5254 0.5559 6.782 0.4880 0.06762 0.08962
## 25% 0.6051 0.6294 0.6564 7.093 0.5955 0.09342 0.12988
## 50% 0.6300 0.6650 0.6985 7.284 0.6259 0.11382 0.15979
## 75% 0.6716 0.7058 0.7381 7.591 0.6625 0.15200 0.20972
## 95% 0.7372 0.7554 0.7763 8.171 0.7182 0.21227 0.28739
## 99% 0.7479 0.7710 0.7964 8.700 0.7306 0.24326 0.32237
## reho_90 reho_fwhm reho_mean vmhc_50 vmhc_75 vmhc_90 vmhc_fwhm
## 1% 0.08866 9.031 0.06265 0.1776 0.3422 0.5338 5.393
## 5% 0.11621 10.975 0.07593 0.2355 0.4187 0.6079 5.803
## 25% 0.16977 13.115 0.10428 0.3217 0.5170 0.6864 6.255
## 50% 0.20697 14.113 0.12494 0.3919 0.5826 0.7375 6.465
## 75% 0.26642 14.890 0.16334 0.4566 0.6427 0.7841 6.649
## 95% 0.35875 15.938 0.22444 0.5579 0.7276 0.8455 6.900
## 99% 0.39790 16.975 0.25374 0.6078 0.7767 0.8755 7.081
## vmhc_mean qcat qline qsize
## 1% 0.2136 1 3 0.40
## 5% 0.2625 5 2 0.25
## 25% 0.3293 25 5 0.30
## 50% 0.3801 50 1 0.25
## 75% 0.4260 25 5 0.30
## 95% 0.4957 5 2 0.25
## 99% 0.5303 1 3 0.40
Associate a detailed description with each measure
dnames <- c("fALFF", "REHO", "VMHC")
mnames <- c("Median", "75th Percentile", "90th Percentile", "FWHM (mm)", "Mean")
descs <- expand.grid(m = mnames, d = dnames)
descs <- paste(descs[, 2], descs[, 1], sep = " - ")
mdf <- data.frame(measure = qc.measures, description = descs)
print(mdf)
## measure description
## 1 falff_50 fALFF - Median
## 2 falff_75 fALFF - 75th Percentile
## 3 falff_90 fALFF - 90th Percentile
## 4 falff_fwhm fALFF - FWHM (mm)
## 5 falff_mean fALFF - Mean
## 6 reho_50 REHO - Median
## 7 reho_75 REHO - 75th Percentile
## 8 reho_90 REHO - 90th Percentile
## 9 reho_fwhm REHO - FWHM (mm)
## 10 reho_mean REHO - Mean
## 11 vmhc_50 VMHC - Median
## 12 vmhc_75 VMHC - 75th Percentile
## 13 vmhc_90 VMHC - 90th Percentile
## 14 vmhc_fwhm VMHC - FWHM (mm)
## 15 vmhc_mean VMHC - Mean
cat("CHECK ABOVE...do the columns for each row match\n")
## CHECK ABOVE...do the columns for each row match
This function will add percentile lines in the background plot: ggplot object pdf: percentile data frame
compile_percentiles <- function(pdf, measure, cols = NULL) {
ret <- lapply(1:nrow(pdf), function(i) {
p <- pdf[i, ]
if (!is.null(cols)) {
plot <- geom_hline(aes_string(yintercept = measure), data = p, size = as.numeric(p$qsize),
linetype = as.numeric(p$qline), color = cols[i])
# as.character(p$qcolor[1])
} else {
plot <- geom_hline(aes_string(yintercept = measure), data = p, size = as.numeric(p$qsize[1]),
linetype = as.numeric(p$qline[1]), color = "grey50")
}
return(plot)
})
return(ret)
}
Sometimes extreme data-points can skew the plot and make it difficult to see the spread of the data.
I will avoid plotting those outlier points by setting the axis to only the points that I want. ggplot will complain but let her (poor baby).
# functions
range.outlier.iqr <- function(x, times = 3) {
upper.limit <- quantile(x, 0.75) + times * IQR(x)
lower.limit <- quantile(x, 0.25) - times * IQR(x)
return(c(lower.limit, upper.limit))
}
outlier.iqr <- function(x, times = 3) {
tmp <- range.outlier.iqr(x, times)
lower.limit <- tmp[1]
upper.limit <- tmp[2]
return((x > upper.limit) | (x < lower.limit))
}
# outlier values (if any)
lst.outlier.iqr <- llply(qc.measures, function(measure) {
ret <- subset(df, select = c("subject", "site", "site.name", "session",
"scan", measure))
inds <- outlier.iqr(df[[measure]])
return(ret[inds, ])
})
names(lst.outlier.iqr) <- qc.measures
# new ranges of our plots (sans outliers)
df.range.iqr <- as.data.frame(sapply(qc.measures, function(m) {
inds <- !outlier.iqr(df[[m]])
range(df[[m]][inds]) * c(0.99, 1.01)
}))
A function with all the theme jazz
set_themes <- function(family = "Times", text.size.x = 14, text.size.y = 16,
title.size = 18) {
family <- "sans"
pg <- list(theme_bw(), theme(axis.title.x = element_text(family = family,
face = "plain", size = title.size)), theme(axis.title.y = element_text(family = family,
face = "plain", size = title.size, angle = 90, vjust = 0.25)), theme(axis.text.x = element_text(family = family,
face = "plain", size = text.size.x, vjust = 0.5, angle = 45)), theme(axis.text.y = element_text(family = family,
face = "plain", size = text.size.y, angle = 90)), theme(axis.ticks.length = unit(0.15,
"lines")), theme(axis.ticks.margin = unit(0.15, "lines")), theme(plot.margin = unit(c(0.25,
1, 0.25, 1), "lines")), theme(legend.position = "none"))
return(pg)
}
I will be plotting a bunch of different data-sets here. First I will have all the data, then I will have it remote 3 x the IQR. So two sets of the same plots.
for (i in 1:nrow(mdf)) {
measure <- as.character(mdf$measure[i])
desc <- as.character(mdf$description[i])
# ### Option 1 First, I'll plot ones with 1%, 5%, 25%, and 50% percentile
# lines
pg1 = ggplot(df, aes_string(x = "site.name", y = measure))
# Add those percentile lines
pg2 = pg1 + compile_percentiles(percentiles, measure, qcols)
# Add main plot - violin plot + boxplot for all the data - jitter plot for
# each site (adjust the color) - x and y labels
pg3 = pg2 + geom_violin(aes(x = global), color = "gray50") + geom_boxplot(aes(x = global),
width = 0.1, fill = "gray50", outlier.size = 0) + geom_jitter(aes(color = site.name),
position = position_jitter(width = 0.1)) + scale_color_manual(values = c(brewer.pal(4,
"Dark2"), cmi_site_colors_ramp(nsites))) + ylab(desc) + xlab("")
# Add the y-range limit
pg4 = pg3
pg4 = pg4 + ylim(df.range.iqr[[measure]])
# Below assumes that you are doing this with a default axis (sites on x,
# data on y)
pg5 = pg4 + set_themes()
# Plot
pg = pg5
print(pg)
# ggsave('plot_option02.png', pg, height=2.5, width=5)
# readline('continue?')
cat("\n\n\n\n")
}
## Warning: Removed 122 rows containing non-finite values (stat_ydensity).
## Warning: Removed 122 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 122 rows containing missing values (geom_point).
## Warning: Removed 19 rows containing non-finite values (stat_ydensity).
## Warning: Removed 19 rows containing non-finite values (stat_boxplot).
## Warning: Removed 19 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing non-finite values (stat_ydensity).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 13 rows containing non-finite values (stat_ydensity).
## Warning: Removed 13 rows containing non-finite values (stat_boxplot).
## Warning: Removed 13 rows containing missing values (geom_point).
## Warning: Removed 86 rows containing non-finite values (stat_ydensity).
## Warning: Removed 86 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 86 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_ydensity).
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_ydensity).
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
for (i in 1:nrow(mdf)) {
measure <- as.character(mdf$measure[i])
desc <- as.character(mdf$description[i])
# ### Option 1 First, I'll plot ones with 1%, 5%, 25%, and 50% percentile
# lines
pg1 = ggplot(df, aes_string(x = "site.name", y = measure))
# Add those percentile lines
pg2 = pg1 + compile_percentiles(percentiles, measure, qcols)
# Add main plot - violin plot + boxplot for all the data - jitter plot for
# each site (adjust the color) - x and y labels
pg3 = pg2 + geom_violin(aes(x = global), color = "gray50") + geom_boxplot(aes(x = global),
width = 0.1, fill = "gray50", outlier.size = 0) + geom_jitter(aes(color = site.name),
position = position_jitter(width = 0.1)) + scale_color_manual(values = c(brewer.pal(4,
"Dark2"), cmi_site_colors_ramp(nsites))) + ylab(desc) + xlab("")
# Add the y-range limit and the outlier points on the maximum of the range
# only if there are any outliers
pg4 = pg3
# pg4=pg4 + ylim(df.range.iqr[[measure]])
# Below assumes that you are doing this with a default axis (sites on x,
# data on y)
pg5 = pg4 + set_themes()
# Plot
pg = pg5
print(pg)
# ggsave('plot_option02.png', pg, height=2.5, width=5)
# readline('continue?')
cat("\n\n\n\n")
}
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).